The data is downloaded from Kaggle and describes houses by means of roughly 80 variables. The goal is to use these explanatory variables to predict the house prices (SalePrice), which is a regression problem. Because we also want to carry out a classification, we additionally divide SalePrice into three categories, namely "low", "middle" and "upper class", and then attempt to predict these three classes.
https://www.kaggle.com/c/house-prices-advanced-regression-techniques
This notebook explores the data in order to understand the basic relationships between the variables and to get a feeling for which variables might be good predictors of the house prices. A separate notebook contains the statistical and machine learning models for the predictions.
Authors: Julia Hammerer, Vanessa Mai. Last changes: 18.11.2018
# sys is needed here to extend the module search path.
import sys
# Put the relative helper directory at the front of sys.path so that the
# `from helper import ...` statements further down in this cell can be resolved.
sys.path.insert(0, '../helper/')
# load packages
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
import missingno as msno
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import warnings
import math
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# the same `table` helper is provided by pandas.plotting.
from pandas.plotting import table
from plotly.offline import init_notebook_mode
from plotly.offline import iplot
from plotly.offline import plot
from scipy.stats import mannwhitneyu
from statsmodels.distributions.empirical_distribution import ECDF
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import norm
from helper import na_ratio_table
from helper import corr_heatmap
from helper import corr_matrix_1
C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas_profiling\plot.py:15: UserWarning:
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.
The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
app.launch_new_instance()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
self.io_loop.start()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
self._run_once()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 1434, in _run_once
handle._run()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\asyncio\events.py", line 145, in _run
self._callback(*self._args)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
ret = callback()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 1233, in inner
self.run()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 1147, in run
yielded = self.gen.send(value)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 370, in dispatch_queue
yield self.process_one()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 346, in wrapper
runner = Runner(result, future, yielded)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 1080, in __init__
self.run()
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 1147, in run
yielded = self.gen.send(value)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
yield gen.maybe_future(dispatch(*args))
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
user_expressions, allow_stdin,
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py", line 326, in wrapper
yielded = next(result)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2817, in run_cell
raw_cell, store_history, silent, shell_futures)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2843, in _run_cell
return runner(coro)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
coro.send(None)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3018, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3183, in run_ast_nodes
if (yield from self.run_code(code, result)):
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3265, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-bafafaff196f>", line 2, in <module>
get_ipython().run_line_magic('matplotlib', 'inline')
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2285, in run_line_magic
result = fn(*args,**kwargs)
File "<decorator-gen-108>", line 2, in matplotlib
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\magic.py", line 187, in <lambda>
call = lambda f, *a, **k: f(*a, **k)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\magics\pylab.py", line 99, in matplotlib
gui, backend = self.shell.enable_matplotlib(args.gui)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3353, in enable_matplotlib
pt.activate_matplotlib(backend)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\pylabtools.py", line 314, in activate_matplotlib
matplotlib.pyplot.switch_backend(backend)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\pyplot.py", line 231, in switch_backend
matplotlib.use(newbackend, warn=False, force=True)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\__init__.py", line 1410, in use
reload(sys.modules['matplotlib.backends'])
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\importlib\__init__.py", line 166, in reload
_bootstrap._exec(spec, module)
File "C:\Users\maiv2\AppData\Local\Continuum\anaconda3\lib\site-packages\matplotlib\backends\__init__.py", line 16, in <module>
line for line in traceback.format_stack()
matplotlib.use(BACKEND)
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')
# Load the data.
# The Kaggle competition ships two files, but only the training set
# contains the target variable, so the whole analysis is based on it.
df = pd.read_csv(filepath_or_buffer="../data/house_prices_train.csv")
print(
    "Number of records and variables: ",
    df.shape,
)
Number of records and variables: (1460, 81)
# First overview of the data via the pandas-profiling report.
# For every variable it provides simple histograms, distributions,
# missingness and correlations.
pandas_profiling.ProfileReport(df=df)
Dataset info
| Number of variables | 81 |
|---|---|
| Number of observations | 1460 |
| Total Missing (%) | 5.9% |
| Total size in memory | 924.0 KiB |
| Average record size in memory | 648.1 B |
Variables types
| Numeric | 38 |
|---|---|
| Categorical | 43 |
| Boolean | 0 |
| Date | 0 |
| Text (Unique) | 0 |
| Rejected | 0 |
| Unsupported | 0 |
Warnings
- 2ndFlrSF has 829 / 56.8% zeros (Zeros)
- 3SsnPorch has 1436 / 98.4% zeros (Zeros)
- Alley has 1369 / 93.8% missing values (Missing)
- BsmtCond has 37 / 2.5% missing values (Missing)
- BsmtExposure has 38 / 2.6% missing values (Missing)
- BsmtFinSF1 has 467 / 32.0% zeros (Zeros)
- BsmtFinSF2 has 1293 / 88.6% zeros (Zeros)
- BsmtFinType1 has 37 / 2.5% missing values (Missing)
- BsmtFinType2 has 38 / 2.6% missing values (Missing)
- BsmtFullBath has 856 / 58.6% zeros (Zeros)
- BsmtHalfBath has 1378 / 94.4% zeros (Zeros)
- BsmtQual has 37 / 2.5% missing values (Missing)
- BsmtUnfSF has 118 / 8.1% zeros (Zeros)
- EnclosedPorch has 1252 / 85.8% zeros (Zeros)
- Fence has 1179 / 80.8% missing values (Missing)
- FireplaceQu has 690 / 47.3% missing values (Missing)
- Fireplaces has 690 / 47.3% zeros (Zeros)
- GarageArea has 81 / 5.5% zeros (Zeros)
- GarageCars has 81 / 5.5% zeros (Zeros)
- GarageCond has 81 / 5.5% missing values (Missing)
- GarageFinish has 81 / 5.5% missing values (Missing)
- GarageQual has 81 / 5.5% missing values (Missing)
- GarageType has 81 / 5.5% missing values (Missing)
- GarageYrBlt has 81 / 5.5% missing values (Missing)
- HalfBath has 913 / 62.5% zeros (Zeros)
- LotFrontage has 259 / 17.7% missing values (Missing)
- LowQualFinSF has 1434 / 98.2% zeros (Zeros)
- MasVnrArea has 861 / 59.0% zeros (Zeros)
- MiscFeature has 1406 / 96.3% missing values (Missing)
- MiscVal is highly skewed (γ1 = 24.477) (Skewed)
- MiscVal has 1408 / 96.4% zeros (Zeros)
- OpenPorchSF has 656 / 44.9% zeros (Zeros)
- PoolArea has 1453 / 99.5% zeros (Zeros)
- PoolQC has 1453 / 99.5% missing values (Missing)
- ScreenPorch has 1344 / 92.1% zeros (Zeros)
- TotalBsmtSF has 37 / 2.5% zeros (Zeros)
- WoodDeckSF has 761 / 52.1% zeros (Zeros)

1stFlrSF
Numeric
| Distinct count | 753 |
|---|---|
| Unique (%) | 51.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1162.6 |
|---|---|
| Minimum | 334 |
| Maximum | 4692 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 334 |
|---|---|
| 5-th percentile | 672.95 |
| Q1 | 882 |
| Median | 1087 |
| Q3 | 1391.2 |
| 95-th percentile | 1831.2 |
| Maximum | 4692 |
| Range | 4358 |
| Interquartile range | 509.25 |
Descriptive statistics
| Standard deviation | 386.59 |
|---|---|
| Coef of variation | 0.33251 |
| Kurtosis | 5.7458 |
| Mean | 1162.6 |
| MAD | 300.58 |
| Skewness | 1.3768 |
| Sum | 1697435 |
| Variance | 149450 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 864 | 25 | 1.7% |
|
| 1040 | 16 | 1.1% |
|
| 912 | 14 | 1.0% |
|
| 848 | 12 | 0.8% |
|
| 894 | 12 | 0.8% |
|
| 672 | 11 | 0.8% |
|
| 816 | 9 | 0.6% |
|
| 630 | 9 | 0.6% |
|
| 936 | 7 | 0.5% |
|
| 960 | 7 | 0.5% |
|
| Other values (743) | 1338 | 91.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 334 | 1 | 0.1% |
|
| 372 | 1 | 0.1% |
|
| 438 | 1 | 0.1% |
|
| 480 | 1 | 0.1% |
|
| 483 | 7 | 0.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2633 | 1 | 0.1% |
|
| 2898 | 1 | 0.1% |
|
| 3138 | 1 | 0.1% |
|
| 3228 | 1 | 0.1% |
|
| 4692 | 1 | 0.1% |
|
2ndFlrSF
Numeric
| Distinct count | 417 |
|---|---|
| Unique (%) | 28.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 346.99 |
|---|---|
| Minimum | 0 |
| Maximum | 2065 |
| Zeros (%) | 56.8% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 728 |
| 95-th percentile | 1141 |
| Maximum | 2065 |
| Range | 2065 |
| Interquartile range | 728 |
Descriptive statistics
| Standard deviation | 436.53 |
|---|---|
| Coef of variation | 1.258 |
| Kurtosis | -0.55346 |
| Mean | 346.99 |
| MAD | 396.48 |
| Skewness | 0.81303 |
| Sum | 506609 |
| Variance | 190560 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 829 | 56.8% |
|
| 728 | 10 | 0.7% |
|
| 504 | 9 | 0.6% |
|
| 672 | 8 | 0.5% |
|
| 546 | 8 | 0.5% |
|
| 720 | 7 | 0.5% |
|
| 600 | 7 | 0.5% |
|
| 896 | 6 | 0.4% |
|
| 780 | 5 | 0.3% |
|
| 862 | 5 | 0.3% |
|
| Other values (407) | 566 | 38.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 829 | 56.8% |
|
| 110 | 1 | 0.1% |
|
| 167 | 1 | 0.1% |
|
| 192 | 1 | 0.1% |
|
| 208 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1611 | 1 | 0.1% |
|
| 1796 | 1 | 0.1% |
|
| 1818 | 1 | 0.1% |
|
| 1872 | 1 | 0.1% |
|
| 2065 | 1 | 0.1% |
|
3SsnPorch
Numeric
| Distinct count | 20 |
|---|---|
| Unique (%) | 1.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.4096 |
|---|---|
| Minimum | 0 |
| Maximum | 508 |
| Zeros (%) | 98.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 0 |
| Maximum | 508 |
| Range | 508 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 29.317 |
|---|---|
| Coef of variation | 8.5985 |
| Kurtosis | 123.66 |
| Mean | 3.4096 |
| MAD | 6.7071 |
| Skewness | 10.304 |
| Sum | 4978 |
| Variance | 859.51 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1436 | 98.4% |
|
| 168 | 3 | 0.2% |
|
| 216 | 2 | 0.1% |
|
| 144 | 2 | 0.1% |
|
| 180 | 2 | 0.1% |
|
| 245 | 1 | 0.1% |
|
| 238 | 1 | 0.1% |
|
| 290 | 1 | 0.1% |
|
| 196 | 1 | 0.1% |
|
| 182 | 1 | 0.1% |
|
| Other values (10) | 10 | 0.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1436 | 98.4% |
|
| 23 | 1 | 0.1% |
|
| 96 | 1 | 0.1% |
|
| 130 | 1 | 0.1% |
|
| 140 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 290 | 1 | 0.1% |
|
| 304 | 1 | 0.1% |
|
| 320 | 1 | 0.1% |
|
| 407 | 1 | 0.1% |
|
| 508 | 1 | 0.1% |
|
Alley
Categorical
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 93.8% |
| Missing (n) | 1369 |
| Grvl |
|
|---|---|
| Pave |
|
| (Missing) |
1369
|
| Value | Count | Frequency (%) | |
| Grvl | 50 | 3.4% |
|
| Pave | 41 | 2.8% |
|
| (Missing) | 1369 | 93.8% |
|
BedroomAbvGr
Numeric
| Distinct count | 8 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.8664 |
|---|---|
| Minimum | 0 |
| Maximum | 8 |
| Zeros (%) | 0.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 2 |
| Q1 | 2 |
| Median | 3 |
| Q3 | 3 |
| 95-th percentile | 4 |
| Maximum | 8 |
| Range | 8 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.81578 |
|---|---|
| Coef of variation | 0.2846 |
| Kurtosis | 2.2309 |
| Mean | 2.8664 |
| MAD | 0.57631 |
| Skewness | 0.21179 |
| Sum | 4185 |
| Variance | 0.66549 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 3 | 804 | 55.1% |
|
| 2 | 358 | 24.5% |
|
| 4 | 213 | 14.6% |
|
| 1 | 50 | 3.4% |
|
| 5 | 21 | 1.4% |
|
| 6 | 7 | 0.5% |
|
| 0 | 6 | 0.4% |
|
| 8 | 1 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 6 | 0.4% |
|
| 1 | 50 | 3.4% |
|
| 2 | 358 | 24.5% |
|
| 3 | 804 | 55.1% |
|
| 4 | 213 | 14.6% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3 | 804 | 55.1% |
|
| 4 | 213 | 14.6% |
|
| 5 | 21 | 1.4% |
|
| 6 | 7 | 0.5% |
|
| 8 | 1 | 0.1% |
|
BldgType
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| 1Fam |
1220
|
|---|---|
| TwnhsE |
|
| Duplex |
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| 1Fam | 1220 | 83.6% |
|
| TwnhsE | 114 | 7.8% |
|
| Duplex | 52 | 3.6% |
|
| Twnhs | 43 | 2.9% |
|
| 2fmCon | 31 | 2.1% |
|
BsmtCond
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 2.5% |
| Missing (n) | 37 |
| TA |
1311
|
|---|---|
| Gd |
|
| Fa |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| TA | 1311 | 89.8% |
|
| Gd | 65 | 4.5% |
|
| Fa | 45 | 3.1% |
|
| Po | 2 | 0.1% |
|
| (Missing) | 37 | 2.5% |
|
BsmtExposure
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 2.6% |
| Missing (n) | 38 |
| No |
953
|
|---|---|
| Av |
221
|
| Gd |
|
| Value | Count | Frequency (%) | |
| No | 953 | 65.3% |
|
| Av | 221 | 15.1% |
|
| Gd | 134 | 9.2% |
|
| Mn | 114 | 7.8% |
|
| (Missing) | 38 | 2.6% |
|
BsmtFinSF1
Numeric
| Distinct count | 637 |
|---|---|
| Unique (%) | 43.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 443.64 |
|---|---|
| Minimum | 0 |
| Maximum | 5644 |
| Zeros (%) | 32.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 383.5 |
| Q3 | 712.25 |
| 95-th percentile | 1274 |
| Maximum | 5644 |
| Range | 5644 |
| Interquartile range | 712.25 |
Descriptive statistics
| Standard deviation | 456.1 |
|---|---|
| Coef of variation | 1.0281 |
| Kurtosis | 11.118 |
| Mean | 443.64 |
| MAD | 367.37 |
| Skewness | 1.6855 |
| Sum | 647714 |
| Variance | 208030 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 467 | 32.0% |
|
| 24 | 12 | 0.8% |
|
| 16 | 9 | 0.6% |
|
| 20 | 5 | 0.3% |
|
| 686 | 5 | 0.3% |
|
| 616 | 5 | 0.3% |
|
| 936 | 5 | 0.3% |
|
| 662 | 5 | 0.3% |
|
| 428 | 4 | 0.3% |
|
| 655 | 4 | 0.3% |
|
| Other values (627) | 939 | 64.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 467 | 32.0% |
|
| 2 | 1 | 0.1% |
|
| 16 | 9 | 0.6% |
|
| 20 | 5 | 0.3% |
|
| 24 | 12 | 0.8% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1904 | 1 | 0.1% |
|
| 2096 | 1 | 0.1% |
|
| 2188 | 1 | 0.1% |
|
| 2260 | 1 | 0.1% |
|
| 5644 | 1 | 0.1% |
|
BsmtFinSF2
Numeric
| Distinct count | 144 |
|---|---|
| Unique (%) | 9.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 46.549 |
|---|---|
| Minimum | 0 |
| Maximum | 1474 |
| Zeros (%) | 88.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 396.2 |
| Maximum | 1474 |
| Range | 1474 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 161.32 |
|---|---|
| Coef of variation | 3.4656 |
| Kurtosis | 20.113 |
| Mean | 46.549 |
| MAD | 82.535 |
| Skewness | 4.2553 |
| Sum | 67962 |
| Variance | 26024 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1293 | 88.6% |
|
| 180 | 5 | 0.3% |
|
| 374 | 3 | 0.2% |
|
| 551 | 2 | 0.1% |
|
| 93 | 2 | 0.1% |
|
| 468 | 2 | 0.1% |
|
| 147 | 2 | 0.1% |
|
| 480 | 2 | 0.1% |
|
| 539 | 2 | 0.1% |
|
| 712 | 2 | 0.1% |
|
| Other values (134) | 145 | 9.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1293 | 88.6% |
|
| 28 | 1 | 0.1% |
|
| 32 | 1 | 0.1% |
|
| 35 | 1 | 0.1% |
|
| 40 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1080 | 1 | 0.1% |
|
| 1085 | 1 | 0.1% |
|
| 1120 | 1 | 0.1% |
|
| 1127 | 1 | 0.1% |
|
| 1474 | 1 | 0.1% |
|
BsmtFinType1
Categorical
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 2.5% |
| Missing (n) | 37 |
| Unf |
430
|
|---|---|
| GLQ |
418
|
| ALQ |
220
|
| Other values (3) |
355
|
| Value | Count | Frequency (%) | |
| Unf | 430 | 29.5% |
|
| GLQ | 418 | 28.6% |
|
| ALQ | 220 | 15.1% |
|
| BLQ | 148 | 10.1% |
|
| Rec | 133 | 9.1% |
|
| LwQ | 74 | 5.1% |
|
| (Missing) | 37 | 2.5% |
|
BsmtFinType2
Categorical
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 2.6% |
| Missing (n) | 38 |
| Unf |
1256
|
|---|---|
| Rec |
|
| LwQ |
|
| Other values (3) |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| Unf | 1256 | 86.0% |
|
| Rec | 54 | 3.7% |
|
| LwQ | 46 | 3.2% |
|
| BLQ | 33 | 2.3% |
|
| ALQ | 19 | 1.3% |
|
| GLQ | 14 | 1.0% |
|
| (Missing) | 38 | 2.6% |
|
BsmtFullBath
Numeric
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.42534 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 58.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 1 |
| 95-th percentile | 1 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.51891 |
|---|---|
| Coef of variation | 1.22 |
| Kurtosis | -0.8391 |
| Mean | 0.42534 |
| MAD | 0.49876 |
| Skewness | 0.59607 |
| Sum | 621 |
| Variance | 0.26927 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 856 | 58.6% |
|
| 1 | 588 | 40.3% |
|
| 2 | 15 | 1.0% |
|
| 3 | 1 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 856 | 58.6% |
|
| 1 | 588 | 40.3% |
|
| 2 | 15 | 1.0% |
|
| 3 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 856 | 58.6% |
|
| 1 | 588 | 40.3% |
|
| 2 | 15 | 1.0% |
|
| 3 | 1 | 0.1% |
|
BsmtHalfBath
Numeric
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.057534 |
|---|---|
| Minimum | 0 |
| Maximum | 2 |
| Zeros (%) | 94.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 1 |
| Maximum | 2 |
| Range | 2 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.23875 |
|---|---|
| Coef of variation | 4.1497 |
| Kurtosis | 16.397 |
| Mean | 0.057534 |
| MAD | 0.10861 |
| Skewness | 4.1034 |
| Sum | 84 |
| Variance | 0.057003 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1378 | 94.4% |
|
| 1 | 80 | 5.5% |
|
| 2 | 2 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1378 | 94.4% |
|
| 1 | 80 | 5.5% |
|
| 2 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1378 | 94.4% |
|
| 1 | 80 | 5.5% |
|
| 2 | 2 | 0.1% |
|
BsmtQual
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 2.5% |
| Missing (n) | 37 |
| TA |
649
|
|---|---|
| Gd |
618
|
| Ex |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| TA | 649 | 44.5% |
|
| Gd | 618 | 42.3% |
|
| Ex | 121 | 8.3% |
|
| Fa | 35 | 2.4% |
|
| (Missing) | 37 | 2.5% |
|
BsmtUnfSF
Numeric
| Distinct count | 780 |
|---|---|
| Unique (%) | 53.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 567.24 |
|---|---|
| Minimum | 0 |
| Maximum | 2336 |
| Zeros (%) | 8.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 223 |
| Median | 477.5 |
| Q3 | 808 |
| 95-th percentile | 1468 |
| Maximum | 2336 |
| Range | 2336 |
| Interquartile range | 585 |
Descriptive statistics
| Standard deviation | 441.87 |
|---|---|
| Coef of variation | 0.77898 |
| Kurtosis | 0.47499 |
| Mean | 567.24 |
| MAD | 353.28 |
| Skewness | 0.92027 |
| Sum | 828171 |
| Variance | 195250 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 118 | 8.1% |
|
| 728 | 9 | 0.6% |
|
| 384 | 8 | 0.5% |
|
| 572 | 7 | 0.5% |
|
| 600 | 7 | 0.5% |
|
| 300 | 7 | 0.5% |
|
| 440 | 6 | 0.4% |
|
| 625 | 6 | 0.4% |
|
| 280 | 6 | 0.4% |
|
| 672 | 6 | 0.4% |
|
| Other values (770) | 1280 | 87.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 118 | 8.1% |
|
| 14 | 1 | 0.1% |
|
| 15 | 1 | 0.1% |
|
| 23 | 2 | 0.1% |
|
| 26 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2042 | 1 | 0.1% |
|
| 2046 | 1 | 0.1% |
|
| 2121 | 1 | 0.1% |
|
| 2153 | 1 | 0.1% |
|
| 2336 | 1 | 0.1% |
|
CentralAir
Categorical
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Y |
1365
|
|---|---|
| N |
|
| Value | Count | Frequency (%) | |
| Y | 1365 | 93.5% |
|
| N | 95 | 6.5% |
|
Condition1
Categorical
| Distinct count | 9 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Norm |
1260
|
|---|---|
| Feedr |
|
| Artery |
|
| Other values (6) |
|
| Value | Count | Frequency (%) | |
| Norm | 1260 | 86.3% |
|
| Feedr | 81 | 5.5% |
|
| Artery | 48 | 3.3% |
|
| RRAn | 26 | 1.8% |
|
| PosN | 19 | 1.3% |
|
| RRAe | 11 | 0.8% |
|
| PosA | 8 | 0.5% |
|
| RRNn | 5 | 0.3% |
|
| RRNe | 2 | 0.1% |
|
Condition2
Categorical
| Distinct count | 8 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Norm |
1445
|
|---|---|
| Feedr |
|
| Artery |
|
| Other values (5) |
|
| Value | Count | Frequency (%) | |
| Norm | 1445 | 99.0% |
|
| Feedr | 6 | 0.4% |
|
| Artery | 2 | 0.1% |
|
| RRNn | 2 | 0.1% |
|
| PosN | 2 | 0.1% |
|
| RRAe | 1 | 0.1% |
|
| PosA | 1 | 0.1% |
|
| RRAn | 1 | 0.1% |
|
Electrical
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.1% |
| Missing (n) | 1 |
| SBrkr |
1334
|
|---|---|
| FuseA |
|
| FuseF |
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| SBrkr | 1334 | 91.4% |
|
| FuseA | 94 | 6.4% |
|
| FuseF | 27 | 1.8% |
|
| FuseP | 3 | 0.2% |
|
| Mix | 1 | 0.1% |
|
| (Missing) | 1 | 0.1% |
|
EnclosedPorch
Numeric
| Distinct count | 120 |
|---|---|
| Unique (%) | 8.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 21.954 |
|---|---|
| Minimum | 0 |
| Maximum | 552 |
| Zeros (%) | 85.8% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 180.15 |
| Maximum | 552 |
| Range | 552 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 61.119 |
|---|---|
| Coef of variation | 2.784 |
| Kurtosis | 10.431 |
| Mean | 21.954 |
| MAD | 37.66 |
| Skewness | 3.0899 |
| Sum | 32053 |
| Variance | 3735.6 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1252 | 85.8% |
|
| 112 | 15 | 1.0% |
|
| 96 | 6 | 0.4% |
|
| 120 | 5 | 0.3% |
|
| 144 | 5 | 0.3% |
|
| 192 | 5 | 0.3% |
|
| 216 | 5 | 0.3% |
|
| 252 | 4 | 0.3% |
|
| 116 | 4 | 0.3% |
|
| 156 | 4 | 0.3% |
|
| Other values (110) | 155 | 10.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1252 | 85.8% |
|
| 19 | 1 | 0.1% |
|
| 20 | 1 | 0.1% |
|
| 24 | 1 | 0.1% |
|
| 30 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 301 | 1 | 0.1% |
|
| 318 | 1 | 0.1% |
|
| 330 | 1 | 0.1% |
|
| 386 | 1 | 0.1% |
|
| 552 | 1 | 0.1% |
|
ExterCond
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| TA |
1282
|
|---|---|
| Gd |
|
| Fa |
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| TA | 1282 | 87.8% |
|
| Gd | 146 | 10.0% |
|
| Fa | 28 | 1.9% |
|
| Ex | 3 | 0.2% |
|
| Po | 1 | 0.1% |
|
ExterQual
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| TA |
906
|
|---|---|
| Gd |
488
|
| Ex |
|
| Value | Count | Frequency (%) | |
| TA | 906 | 62.1% |
|
| Gd | 488 | 33.4% |
|
| Ex | 52 | 3.6% |
|
| Fa | 14 | 1.0% |
|
Exterior1st
Categorical
| Distinct count | 15 |
|---|---|
| Unique (%) | 1.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| VinylSd |
515
|
|---|---|
| HdBoard |
222
|
| MetalSd |
220
|
| Other values (12) |
503
|
| Value | Count | Frequency (%) | |
| VinylSd | 515 | 35.3% |
|
| HdBoard | 222 | 15.2% |
|
| MetalSd | 220 | 15.1% |
|
| Wd Sdng | 206 | 14.1% |
|
| Plywood | 108 | 7.4% |
|
| CemntBd | 61 | 4.2% |
|
| BrkFace | 50 | 3.4% |
|
| WdShing | 26 | 1.8% |
|
| Stucco | 25 | 1.7% |
|
| AsbShng | 20 | 1.4% |
|
| Other values (5) | 7 | 0.5% |
|
Exterior2nd
Categorical
| Distinct count | 16 |
|---|---|
| Unique (%) | 1.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| VinylSd |
504
|
|---|---|
| MetalSd |
214
|
| HdBoard |
207
|
| Other values (13) |
535
|
| Value | Count | Frequency (%) | |
| VinylSd | 504 | 34.5% |
|
| MetalSd | 214 | 14.7% |
|
| HdBoard | 207 | 14.2% |
|
| Wd Sdng | 197 | 13.5% |
|
| Plywood | 142 | 9.7% |
|
| CmentBd | 60 | 4.1% |
|
| Wd Shng | 38 | 2.6% |
|
| Stucco | 26 | 1.8% |
|
| BrkFace | 25 | 1.7% |
|
| AsbShng | 20 | 1.4% |
|
| Other values (6) | 27 | 1.8% |
|
Fence
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 80.8% |
| Missing (n) | 1179 |
| MnPrv |
|
|---|---|
| GdPrv |
|
| GdWo |
|
| (Missing) |
1179
|
| Value | Count | Frequency (%) | |
| MnPrv | 157 | 10.8% |
|
| GdPrv | 59 | 4.0% |
|
| GdWo | 54 | 3.7% |
|
| MnWw | 11 | 0.8% |
|
| (Missing) | 1179 | 80.8% |
|
FireplaceQu
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 47.3% |
| Missing (n) | 690 |
| Gd |
380
|
|---|---|
| TA |
313
|
| Fa |
|
| Other values (2) |
|
| (Missing) |
690
|
| Value | Count | Frequency (%) | |
| Gd | 380 | 26.0% |
|
| TA | 313 | 21.4% |
|
| Fa | 33 | 2.3% |
|
| Ex | 24 | 1.6% |
|
| Po | 20 | 1.4% |
|
| (Missing) | 690 | 47.3% |
|
Fireplaces
Numeric
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.61301 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 47.3% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 1 |
| Q3 | 1 |
| 95-th percentile | 2 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.64467 |
|---|---|
| Coef of variation | 1.0516 |
| Kurtosis | -0.21724 |
| Mean | 0.61301 |
| MAD | 0.57942 |
| Skewness | 0.64957 |
| Sum | 895 |
| Variance | 0.41559 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 690 | 47.3% |
|
| 1 | 650 | 44.5% |
|
| 2 | 115 | 7.9% |
|
| 3 | 5 | 0.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 690 | 47.3% |
|
| 1 | 650 | 44.5% |
|
| 2 | 115 | 7.9% |
|
| 3 | 5 | 0.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 690 | 47.3% |
|
| 1 | 650 | 44.5% |
|
| 2 | 115 | 7.9% |
|
| 3 | 5 | 0.3% |
|
Foundation
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| PConc |
647
|
|---|---|
| CBlock |
634
|
| BrkTil |
146
|
| Other values (3) |
|
| Value | Count | Frequency (%) | |
| PConc | 647 | 44.3% |
|
| CBlock | 634 | 43.4% |
|
| BrkTil | 146 | 10.0% |
|
| Slab | 24 | 1.6% |
|
| Stone | 6 | 0.4% |
|
| Wood | 3 | 0.2% |
|
FullBath
Numeric
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.5651 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 0.6% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 1 |
| Median | 2 |
| Q3 | 2 |
| 95-th percentile | 2 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.55092 |
|---|---|
| Coef of variation | 0.35201 |
| Kurtosis | -0.85704 |
| Mean | 1.5651 |
| MAD | 0.52244 |
| Skewness | 0.036562 |
| Sum | 2285 |
| Variance | 0.30351 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 2 | 768 | 52.6% |
|
| 1 | 650 | 44.5% |
|
| 3 | 33 | 2.3% |
|
| 0 | 9 | 0.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 9 | 0.6% |
|
| 1 | 650 | 44.5% |
|
| 2 | 768 | 52.6% |
|
| 3 | 33 | 2.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 9 | 0.6% |
|
| 1 | 650 | 44.5% |
|
| 2 | 768 | 52.6% |
|
| 3 | 33 | 2.3% |
|
Functional
Categorical
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Typ |
1360
|
|---|---|
| Min2 |
|
| Min1 |
|
| Other values (4) |
|
| Value | Count | Frequency (%) | |
| Typ | 1360 | 93.2% |
|
| Min2 | 34 | 2.3% |
|
| Min1 | 31 | 2.1% |
|
| Mod | 15 | 1.0% |
|
| Maj1 | 14 | 1.0% |
|
| Maj2 | 5 | 0.3% |
|
| Sev | 1 | 0.1% |
|
GarageArea
Numeric
| Distinct count | 441 |
|---|---|
| Unique (%) | 30.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 472.98 |
|---|---|
| Minimum | 0 |
| Maximum | 1418 |
| Zeros (%) | 5.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 334.5 |
| Median | 480 |
| Q3 | 576 |
| 95-th percentile | 850.1 |
| Maximum | 1418 |
| Range | 1418 |
| Interquartile range | 241.5 |
Descriptive statistics
| Standard deviation | 213.8 |
|---|---|
| Coef of variation | 0.45204 |
| Kurtosis | 0.91707 |
| Mean | 472.98 |
| MAD | 160.02 |
| Skewness | 0.17998 |
| Sum | 690551 |
| Variance | 45713 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 81 | 5.5% |
|
| 440 | 49 | 3.4% |
|
| 576 | 47 | 3.2% |
|
| 240 | 38 | 2.6% |
|
| 484 | 34 | 2.3% |
|
| 528 | 33 | 2.3% |
|
| 288 | 27 | 1.8% |
|
| 400 | 25 | 1.7% |
|
| 480 | 24 | 1.6% |
|
| 264 | 24 | 1.6% |
|
| Other values (431) | 1078 | 73.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 81 | 5.5% |
|
| 160 | 2 | 0.1% |
|
| 164 | 1 | 0.1% |
|
| 180 | 9 | 0.6% |
|
| 186 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1220 | 1 | 0.1% |
|
| 1248 | 1 | 0.1% |
|
| 1356 | 1 | 0.1% |
|
| 1390 | 1 | 0.1% |
|
| 1418 | 1 | 0.1% |
|
GarageCars
Numeric
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.7671 |
|---|---|
| Minimum | 0 |
| Maximum | 4 |
| Zeros (%) | 5.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 1 |
| Median | 2 |
| Q3 | 2 |
| 95-th percentile | 3 |
| Maximum | 4 |
| Range | 4 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.74732 |
|---|---|
| Coef of variation | 0.4229 |
| Kurtosis | 0.221 |
| Mean | 1.7671 |
| MAD | 0.58384 |
| Skewness | -0.34255 |
| Sum | 2580 |
| Variance | 0.55848 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 2 | 824 | 56.4% |
|
| 1 | 369 | 25.3% |
|
| 3 | 181 | 12.4% |
|
| 0 | 81 | 5.5% |
|
| 4 | 5 | 0.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 81 | 5.5% |
|
| 1 | 369 | 25.3% |
|
| 2 | 824 | 56.4% |
|
| 3 | 181 | 12.4% |
|
| 4 | 5 | 0.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 81 | 5.5% |
|
| 1 | 369 | 25.3% |
|
| 2 | 824 | 56.4% |
|
| 3 | 181 | 12.4% |
|
| 4 | 5 | 0.3% |
|
GarageCond
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 5.5% |
| Missing (n) | 81 |
| TA |
1326
|
|---|---|
| Fa |
|
| Gd |
|
| Other values (2) |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| TA | 1326 | 90.8% |
|
| Fa | 35 | 2.4% |
|
| Gd | 9 | 0.6% |
|
| Po | 7 | 0.5% |
|
| Ex | 2 | 0.1% |
|
| (Missing) | 81 | 5.5% |
|
GarageFinish
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 5.5% |
| Missing (n) | 81 |
| Unf |
605
|
|---|---|
| RFn |
422
|
| Fin |
352
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| Unf | 605 | 41.4% |
|
| RFn | 422 | 28.9% |
|
| Fin | 352 | 24.1% |
|
| (Missing) | 81 | 5.5% |
|
GarageQual
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 5.5% |
| Missing (n) | 81 |
| TA |
1311
|
|---|---|
| Fa |
|
| Gd |
|
| Other values (2) |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| TA | 1311 | 89.8% |
|
| Fa | 48 | 3.3% |
|
| Gd | 14 | 1.0% |
|
| Po | 3 | 0.2% |
|
| Ex | 3 | 0.2% |
|
| (Missing) | 81 | 5.5% |
|
GarageType
Categorical
| Distinct count | 7 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 5.5% |
| Missing (n) | 81 |
| Attchd |
870
|
|---|---|
| Detchd |
387
|
| BuiltIn |
|
| Other values (3) |
|
| (Missing) |
|
| Value | Count | Frequency (%) | |
| Attchd | 870 | 59.6% |
|
| Detchd | 387 | 26.5% |
|
| BuiltIn | 88 | 6.0% |
|
| Basment | 19 | 1.3% |
|
| CarPort | 9 | 0.6% |
|
| 2Types | 6 | 0.4% |
|
| (Missing) | 81 | 5.5% |
|
GarageYrBlt
Numeric
| Distinct count | 98 |
|---|---|
| Unique (%) | 6.7% |
| Missing (%) | 5.5% |
| Missing (n) | 81 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1978.5 |
|---|---|
| Minimum | 1900 |
| Maximum | 2010 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1900 |
|---|---|
| 5-th percentile | 1930 |
| Q1 | 1961 |
| Median | 1980 |
| Q3 | 2002 |
| 95-th percentile | 2007 |
| Maximum | 2010 |
| Range | 110 |
| Interquartile range | 41 |
Descriptive statistics
| Standard deviation | 24.69 |
|---|---|
| Coef of variation | 0.012479 |
| Kurtosis | -0.41834 |
| Mean | 1978.5 |
| MAD | 20.913 |
| Skewness | -0.64941 |
| Sum | 2728400 |
| Variance | 609.58 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 2005.0 | 65 | 4.5% |
|
| 2006.0 | 59 | 4.0% |
|
| 2004.0 | 53 | 3.6% |
|
| 2003.0 | 50 | 3.4% |
|
| 2007.0 | 49 | 3.4% |
|
| 1977.0 | 35 | 2.4% |
|
| 1998.0 | 31 | 2.1% |
|
| 1999.0 | 30 | 2.1% |
|
| 1976.0 | 29 | 2.0% |
|
| 2008.0 | 29 | 2.0% |
|
| Other values (87) | 949 | 65.0% |
|
| (Missing) | 81 | 5.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1900.0 | 1 | 0.1% |
|
| 1906.0 | 1 | 0.1% |
|
| 1908.0 | 1 | 0.1% |
|
| 1910.0 | 3 | 0.2% |
|
| 1914.0 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2006.0 | 59 | 4.0% |
|
| 2007.0 | 49 | 3.4% |
|
| 2008.0 | 29 | 2.0% |
|
| 2009.0 | 21 | 1.4% |
|
| 2010.0 | 3 | 0.2% |
|
GrLivArea
Numeric
| Distinct count | 861 |
|---|---|
| Unique (%) | 59.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1515.5 |
|---|---|
| Minimum | 334 |
| Maximum | 5642 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 334 |
|---|---|
| 5-th percentile | 848 |
| Q1 | 1129.5 |
| Median | 1464 |
| Q3 | 1776.8 |
| 95-th percentile | 2466.1 |
| Maximum | 5642 |
| Range | 5308 |
| Interquartile range | 647.25 |
Descriptive statistics
| Standard deviation | 525.48 |
|---|---|
| Coef of variation | 0.34675 |
| Kurtosis | 4.8951 |
| Mean | 1515.5 |
| MAD | 397.32 |
| Skewness | 1.3666 |
| Sum | 2212577 |
| Variance | 276130 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 864 | 22 | 1.5% |
|
| 1040 | 14 | 1.0% |
|
| 894 | 11 | 0.8% |
|
| 848 | 10 | 0.7% |
|
| 1456 | 10 | 0.7% |
|
| 912 | 9 | 0.6% |
|
| 1200 | 9 | 0.6% |
|
| 816 | 8 | 0.5% |
|
| 1092 | 8 | 0.5% |
|
| 1344 | 7 | 0.5% |
|
| Other values (851) | 1352 | 92.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 334 | 1 | 0.1% |
|
| 438 | 1 | 0.1% |
|
| 480 | 1 | 0.1% |
|
| 520 | 1 | 0.1% |
|
| 605 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3627 | 1 | 0.1% |
|
| 4316 | 1 | 0.1% |
|
| 4476 | 1 | 0.1% |
|
| 4676 | 1 | 0.1% |
|
| 5642 | 1 | 0.1% |
|
HalfBath
Numeric
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.38288 |
|---|---|
| Minimum | 0 |
| Maximum | 2 |
| Zeros (%) | 62.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 1 |
| 95-th percentile | 1 |
| Maximum | 2 |
| Range | 2 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 0.50289 |
|---|---|
| Coef of variation | 1.3134 |
| Kurtosis | -1.0769 |
| Mean | 0.38288 |
| MAD | 0.47886 |
| Skewness | 0.6759 |
| Sum | 559 |
| Variance | 0.25289 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 913 | 62.5% |
|
| 1 | 535 | 36.6% |
|
| 2 | 12 | 0.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 913 | 62.5% |
|
| 1 | 535 | 36.6% |
|
| 2 | 12 | 0.8% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 913 | 62.5% |
|
| 1 | 535 | 36.6% |
|
| 2 | 12 | 0.8% |
|
Heating
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| GasA |
1428
|
|---|---|
| GasW |
|
| Grav |
|
| Other values (3) |
|
| Value | Count | Frequency (%) | |
| GasA | 1428 | 97.8% |
|
| GasW | 18 | 1.2% |
|
| Grav | 7 | 0.5% |
|
| Wall | 4 | 0.3% |
|
| OthW | 2 | 0.1% |
|
| Floor | 1 | 0.1% |
|
HeatingQC
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Ex |
741
|
|---|---|
| TA |
428
|
| Gd |
241
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| Ex | 741 | 50.8% |
|
| TA | 428 | 29.3% |
|
| Gd | 241 | 16.5% |
|
| Fa | 49 | 3.4% |
|
| Po | 1 | 0.1% |
|
HouseStyle
Categorical
| Distinct count | 8 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| 1Story |
726
|
|---|---|
| 2Story |
445
|
| 1.5Fin |
154
|
| Other values (5) |
|
| Value | Count | Frequency (%) | |
| 1Story | 726 | 49.7% |
|
| 2Story | 445 | 30.5% |
|
| 1.5Fin | 154 | 10.5% |
|
| SLvl | 65 | 4.5% |
|
| SFoyer | 37 | 2.5% |
|
| 1.5Unf | 14 | 1.0% |
|
| 2.5Unf | 11 | 0.8% |
|
| 2.5Fin | 8 | 0.5% |
|
Id
Numeric
| Distinct count | 1460 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 730.5 |
|---|---|
| Minimum | 1 |
| Maximum | 1460 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 73.95 |
| Q1 | 365.75 |
| Median | 730.5 |
| Q3 | 1095.2 |
| 95-th percentile | 1387 |
| Maximum | 1460 |
| Range | 1459 |
| Interquartile range | 729.5 |
Descriptive statistics
| Standard deviation | 421.61 |
|---|---|
| Coef of variation | 0.57715 |
| Kurtosis | -1.2 |
| Mean | 730.5 |
| MAD | 365 |
| Skewness | 0 |
| Sum | 1066530 |
| Variance | 177760 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 1460 | 1 | 0.1% |
|
| 479 | 1 | 0.1% |
|
| 481 | 1 | 0.1% |
|
| 482 | 1 | 0.1% |
|
| 483 | 1 | 0.1% |
|
| 484 | 1 | 0.1% |
|
| 485 | 1 | 0.1% |
|
| 486 | 1 | 0.1% |
|
| 487 | 1 | 0.1% |
|
| 488 | 1 | 0.1% |
|
| Other values (1450) | 1450 | 99.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 1 | 0.1% |
|
| 2 | 1 | 0.1% |
|
| 3 | 1 | 0.1% |
|
| 4 | 1 | 0.1% |
|
| 5 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1456 | 1 | 0.1% |
|
| 1457 | 1 | 0.1% |
|
| 1458 | 1 | 0.1% |
|
| 1459 | 1 | 0.1% |
|
| 1460 | 1 | 0.1% |
|
KitchenAbvGr
Numeric
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.0466 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 0.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1 |
| Q1 | 1 |
| Median | 1 |
| Q3 | 1 |
| 95-th percentile | 1 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.22034 |
|---|---|
| Coef of variation | 0.21053 |
| Kurtosis | 21.532 |
| Mean | 1.0466 |
| MAD | 0.090246 |
| Skewness | 4.4884 |
| Sum | 1528 |
| Variance | 0.048549 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 1 | 1392 | 95.3% |
|
| 2 | 65 | 4.5% |
|
| 3 | 2 | 0.1% |
|
| 0 | 1 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1 | 0.1% |
|
| 1 | 1392 | 95.3% |
|
| 2 | 65 | 4.5% |
|
| 3 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1 | 0.1% |
|
| 1 | 1392 | 95.3% |
|
| 2 | 65 | 4.5% |
|
| 3 | 2 | 0.1% |
|
KitchenQual
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| TA |
735
|
|---|---|
| Gd |
586
|
| Ex |
|
| Value | Count | Frequency (%) | |
| TA | 735 | 50.3% |
|
| Gd | 586 | 40.1% |
|
| Ex | 100 | 6.8% |
|
| Fa | 39 | 2.7% |
|
LandContour
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Lvl |
1311
|
|---|---|
| Bnk |
|
| HLS |
|
| Value | Count | Frequency (%) | |
| Lvl | 1311 | 89.8% |
|
| Bnk | 63 | 4.3% |
|
| HLS | 50 | 3.4% |
|
| Low | 36 | 2.5% |
|
LandSlope
Categorical
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Gtl |
1382
|
|---|---|
| Mod |
|
| Sev |
|
| Value | Count | Frequency (%) | |
| Gtl | 1382 | 94.7% |
|
| Mod | 65 | 4.5% |
|
| Sev | 13 | 0.9% |
|
LotArea
Numeric
| Distinct count | 1073 |
|---|---|
| Unique (%) | 73.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 10517 |
|---|---|
| Minimum | 1300 |
| Maximum | 215245 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1300 |
|---|---|
| 5-th percentile | 3311.7 |
| Q1 | 7553.5 |
| Median | 9478.5 |
| Q3 | 11602 |
| 95-th percentile | 17401 |
| Maximum | 215245 |
| Range | 213945 |
| Interquartile range | 4048 |
Descriptive statistics
| Standard deviation | 9981.3 |
|---|---|
| Coef of variation | 0.94908 |
| Kurtosis | 203.24 |
| Mean | 10517 |
| MAD | 3758.8 |
| Skewness | 12.208 |
| Sum | 15354569 |
| Variance | 99626000 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 7200 | 25 | 1.7% |
|
| 9600 | 24 | 1.6% |
|
| 6000 | 17 | 1.2% |
|
| 10800 | 14 | 1.0% |
|
| 9000 | 14 | 1.0% |
|
| 8400 | 14 | 1.0% |
|
| 1680 | 10 | 0.7% |
|
| 7500 | 9 | 0.6% |
|
| 8125 | 8 | 0.5% |
|
| 9100 | 8 | 0.5% |
|
| Other values (1063) | 1317 | 90.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1300 | 1 | 0.1% |
|
| 1477 | 1 | 0.1% |
|
| 1491 | 1 | 0.1% |
|
| 1526 | 1 | 0.1% |
|
| 1533 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 70761 | 1 | 0.1% |
|
| 115149 | 1 | 0.1% |
|
| 159000 | 1 | 0.1% |
|
| 164660 | 1 | 0.1% |
|
| 215245 | 1 | 0.1% |
|
LotConfig
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Inside |
1052
|
|---|---|
| Corner |
263
|
| CulDSac |
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| Inside | 1052 | 72.1% |
|
| Corner | 263 | 18.0% |
|
| CulDSac | 94 | 6.4% |
|
| FR2 | 47 | 3.2% |
|
| FR3 | 4 | 0.3% |
|
LotFrontage
Numeric
| Distinct count | 111 |
|---|---|
| Unique (%) | 7.6% |
| Missing (%) | 17.7% |
| Missing (n) | 259 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 70.05 |
|---|---|
| Minimum | 21 |
| Maximum | 313 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 21 |
|---|---|
| 5-th percentile | 34 |
| Q1 | 59 |
| Median | 69 |
| Q3 | 80 |
| 95-th percentile | 107 |
| Maximum | 313 |
| Range | 292 |
| Interquartile range | 21 |
Descriptive statistics
| Standard deviation | 24.285 |
|---|---|
| Coef of variation | 0.34668 |
| Kurtosis | 17.453 |
| Mean | 70.05 |
| MAD | 16.762 |
| Skewness | 2.1636 |
| Sum | 84130 |
| Variance | 589.75 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 60.0 | 143 | 9.8% |
|
| 70.0 | 70 | 4.8% |
|
| 80.0 | 69 | 4.7% |
|
| 50.0 | 57 | 3.9% |
|
| 75.0 | 53 | 3.6% |
|
| 65.0 | 44 | 3.0% |
|
| 85.0 | 40 | 2.7% |
|
| 78.0 | 25 | 1.7% |
|
| 21.0 | 23 | 1.6% |
|
| 90.0 | 23 | 1.6% |
|
| Other values (100) | 654 | 44.8% |
|
| (Missing) | 259 | 17.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 21.0 | 23 | 1.6% |
|
| 24.0 | 19 | 1.3% |
|
| 30.0 | 6 | 0.4% |
|
| 32.0 | 5 | 0.3% |
|
| 33.0 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 160.0 | 1 | 0.1% |
|
| 168.0 | 1 | 0.1% |
|
| 174.0 | 2 | 0.1% |
|
| 182.0 | 1 | 0.1% |
|
| 313.0 | 2 | 0.1% |
|
LotShape
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Reg |
925
|
|---|---|
| IR1 |
484
|
| IR2 |
|
| Value | Count | Frequency (%) | |
| Reg | 925 | 63.4% |
|
| IR1 | 484 | 33.2% |
|
| IR2 | 41 | 2.8% |
|
| IR3 | 10 | 0.7% |
|
LowQualFinSF
Numeric
| Distinct count | 24 |
|---|---|
| Unique (%) | 1.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 5.8445 |
|---|---|
| Minimum | 0 |
| Maximum | 572 |
| Zeros (%) | 98.2% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 0 |
| Maximum | 572 |
| Range | 572 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 48.623 |
|---|---|
| Coef of variation | 8.3194 |
| Kurtosis | 83.235 |
| Mean | 5.8445 |
| MAD | 11.481 |
| Skewness | 9.0113 |
| Sum | 8533 |
| Variance | 2364.2 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1434 | 98.2% |
|
| 80 | 3 | 0.2% |
|
| 360 | 2 | 0.1% |
|
| 528 | 1 | 0.1% |
|
| 53 | 1 | 0.1% |
|
| 120 | 1 | 0.1% |
|
| 144 | 1 | 0.1% |
|
| 156 | 1 | 0.1% |
|
| 205 | 1 | 0.1% |
|
| 232 | 1 | 0.1% |
|
| Other values (14) | 14 | 1.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1434 | 98.2% |
|
| 53 | 1 | 0.1% |
|
| 80 | 3 | 0.2% |
|
| 120 | 1 | 0.1% |
|
| 144 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 513 | 1 | 0.1% |
|
| 514 | 1 | 0.1% |
|
| 515 | 1 | 0.1% |
|
| 528 | 1 | 0.1% |
|
| 572 | 1 | 0.1% |
|
MSSubClass
Numeric
| Distinct count | 15 |
|---|---|
| Unique (%) | 1.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 56.897 |
|---|---|
| Minimum | 20 |
| Maximum | 190 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 20 |
|---|---|
| 5-th percentile | 20 |
| Q1 | 20 |
| Median | 50 |
| Q3 | 70 |
| 95-th percentile | 160 |
| Maximum | 190 |
| Range | 170 |
| Interquartile range | 50 |
Descriptive statistics
| Standard deviation | 42.301 |
|---|---|
| Coef of variation | 0.74346 |
| Kurtosis | 1.5802 |
| Mean | 56.897 |
| MAD | 31.283 |
| Skewness | 1.4077 |
| Sum | 83070 |
| Variance | 1789.3 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 20 | 536 | 36.7% |
|
| 60 | 299 | 20.5% |
|
| 50 | 144 | 9.9% |
|
| 120 | 87 | 6.0% |
|
| 30 | 69 | 4.7% |
|
| 160 | 63 | 4.3% |
|
| 70 | 60 | 4.1% |
|
| 80 | 58 | 4.0% |
|
| 90 | 52 | 3.6% |
|
| 190 | 30 | 2.1% |
|
| Other values (5) | 62 | 4.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 20 | 536 | 36.7% |
|
| 30 | 69 | 4.7% |
|
| 40 | 4 | 0.3% |
|
| 45 | 12 | 0.8% |
|
| 50 | 144 | 9.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 90 | 52 | 3.6% |
|
| 120 | 87 | 6.0% |
|
| 160 | 63 | 4.3% |
|
| 180 | 10 | 0.7% |
|
| 190 | 30 | 2.1% |
|
MSZoning
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| RL |
1151
|
|---|---|
| RM |
|
| FV |
|
| Other values (2) |
|
| Value | Count | Frequency (%) | |
| RL | 1151 | 78.8% |
|
| RM | 218 | 14.9% |
|
| FV | 65 | 4.5% |
|
| RH | 16 | 1.1% |
|
| C (all) | 10 | 0.7% |
|
MasVnrArea
Numeric
| Distinct count | 328 |
|---|---|
| Unique (%) | 22.5% |
| Missing (%) | 0.5% |
| Missing (n) | 8 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 103.69 |
|---|---|
| Minimum | 0 |
| Maximum | 1600 |
| Zeros (%) | 59.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 166 |
| 95-th percentile | 456 |
| Maximum | 1600 |
| Range | 1600 |
| Interquartile range | 166 |
Descriptive statistics
| Standard deviation | 181.07 |
|---|---|
| Coef of variation | 1.7463 |
| Kurtosis | 10.082 |
| Mean | 103.69 |
| MAD | 129.78 |
| Skewness | 2.6691 |
| Sum | 150550 |
| Variance | 32785 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0.0 | 861 | 59.0% |
|
| 72.0 | 8 | 0.5% |
|
| 180.0 | 8 | 0.5% |
|
| 108.0 | 8 | 0.5% |
|
| 120.0 | 7 | 0.5% |
|
| 16.0 | 7 | 0.5% |
|
| 106.0 | 6 | 0.4% |
|
| 80.0 | 6 | 0.4% |
|
| 340.0 | 6 | 0.4% |
|
| 200.0 | 6 | 0.4% |
|
| Other values (317) | 529 | 36.2% |
|
| (Missing) | 8 | 0.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 861 | 59.0% |
|
| 1.0 | 2 | 0.1% |
|
| 11.0 | 1 | 0.1% |
|
| 14.0 | 1 | 0.1% |
|
| 16.0 | 7 | 0.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1115.0 | 1 | 0.1% |
|
| 1129.0 | 1 | 0.1% |
|
| 1170.0 | 1 | 0.1% |
|
| 1378.0 | 1 | 0.1% |
|
| 1600.0 | 1 | 0.1% |
|
MasVnrType
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.5% |
| Missing (n) | 8 |
| None |
864
|
|---|---|
| BrkFace |
445
|
| Stone |
|
| Value | Count | Frequency (%) | |
| None | 864 | 59.2% |
|
| BrkFace | 445 | 30.5% |
|
| Stone | 128 | 8.8% |
|
| BrkCmn | 15 | 1.0% |
|
| (Missing) | 8 | 0.5% |
|
MiscFeature
Categorical
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 96.3% |
| Missing (n) | 1406 |
| Shed |
|
|---|---|
| Othr |
|
| Gar2 |
|
| (Missing) |
1406
|
| Value | Count | Frequency (%) | |
| Shed | 49 | 3.4% |
|
| Othr | 2 | 0.1% |
|
| Gar2 | 2 | 0.1% |
|
| TenC | 1 | 0.1% |
|
| (Missing) | 1406 | 96.3% |
|
MiscVal
Numeric
| Distinct count | 21 |
|---|---|
| Unique (%) | 1.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 43.489 |
|---|---|
| Minimum | 0 |
| Maximum | 15500 |
| Zeros (%) | 96.4% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 0 |
| Maximum | 15500 |
| Range | 15500 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 496.12 |
|---|---|
| Coef of variation | 11.408 |
| Kurtosis | 701 |
| Mean | 43.489 |
| MAD | 83.88 |
| Skewness | 24.477 |
| Sum | 63494 |
| Variance | 246140 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1408 | 96.4% |
|
| 400 | 11 | 0.8% |
|
| 500 | 8 | 0.5% |
|
| 700 | 5 | 0.3% |
|
| 450 | 4 | 0.3% |
|
| 2000 | 4 | 0.3% |
|
| 600 | 4 | 0.3% |
|
| 1200 | 2 | 0.1% |
|
| 480 | 2 | 0.1% |
|
| 1150 | 1 | 0.1% |
|
| Other values (11) | 11 | 0.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1408 | 96.4% |
|
| 54 | 1 | 0.1% |
|
| 350 | 1 | 0.1% |
|
| 400 | 11 | 0.8% |
|
| 450 | 4 | 0.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2000 | 4 | 0.3% |
|
| 2500 | 1 | 0.1% |
|
| 3500 | 1 | 0.1% |
|
| 8300 | 1 | 0.1% |
|
| 15500 | 1 | 0.1% |
|
MoSold
Numeric
| Distinct count | 12 |
|---|---|
| Unique (%) | 0.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 6.3219 |
|---|---|
| Minimum | 1 |
| Maximum | 12 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 2 |
| Q1 | 5 |
| Median | 6 |
| Q3 | 8 |
| 95-th percentile | 11 |
| Maximum | 12 |
| Range | 11 |
| Interquartile range | 3 |
Descriptive statistics
| Standard deviation | 2.7036 |
|---|---|
| Coef of variation | 0.42766 |
| Kurtosis | -0.40411 |
| Mean | 6.3219 |
| MAD | 2.1425 |
| Skewness | 0.21205 |
| Sum | 9230 |
| Variance | 7.3096 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 6 | 253 | 17.3% |
|
| 7 | 234 | 16.0% |
|
| 5 | 204 | 14.0% |
|
| 4 | 141 | 9.7% |
|
| 8 | 122 | 8.4% |
|
| 3 | 106 | 7.3% |
|
| 10 | 89 | 6.1% |
|
| 11 | 79 | 5.4% |
|
| 9 | 63 | 4.3% |
|
| 12 | 59 | 4.0% |
|
| Other values (2) | 110 | 7.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 58 | 4.0% |
|
| 2 | 52 | 3.6% |
|
| 3 | 106 | 7.3% |
|
| 4 | 141 | 9.7% |
|
| 5 | 204 | 14.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 8 | 122 | 8.4% |
|
| 9 | 63 | 4.3% |
|
| 10 | 89 | 6.1% |
|
| 11 | 79 | 5.4% |
|
| 12 | 59 | 4.0% |
|
Neighborhood
Categorical
| Distinct count | 25 |
|---|---|
| Unique (%) | 1.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| NAmes |
225
|
|---|---|
| CollgCr |
|
| OldTown |
|
| Other values (22) |
972
|
| Value | Count | Frequency (%) | |
| NAmes | 225 | 15.4% |
|
| CollgCr | 150 | 10.3% |
|
| OldTown | 113 | 7.7% |
|
| Edwards | 100 | 6.8% |
|
| Somerst | 86 | 5.9% |
|
| Gilbert | 79 | 5.4% |
|
| NridgHt | 77 | 5.3% |
|
| Sawyer | 74 | 5.1% |
|
| NWAmes | 73 | 5.0% |
|
| SawyerW | 59 | 4.0% |
|
| Other values (15) | 424 | 29.0% |
|
OpenPorchSF
Numeric
| Distinct count | 202 |
|---|---|
| Unique (%) | 13.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 46.66 |
|---|---|
| Minimum | 0 |
| Maximum | 547 |
| Zeros (%) | 44.9% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 25 |
| Q3 | 68 |
| 95-th percentile | 175.05 |
| Maximum | 547 |
| Range | 547 |
| Interquartile range | 68 |
Descriptive statistics
| Standard deviation | 66.256 |
|---|---|
| Coef of variation | 1.42 |
| Kurtosis | 8.4903 |
| Mean | 46.66 |
| MAD | 47.678 |
| Skewness | 2.3643 |
| Sum | 68124 |
| Variance | 4389.9 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 656 | 44.9% |
|
| 36 | 29 | 2.0% |
|
| 48 | 22 | 1.5% |
|
| 20 | 21 | 1.4% |
|
| 40 | 19 | 1.3% |
|
| 45 | 19 | 1.3% |
|
| 30 | 16 | 1.1% |
|
| 24 | 16 | 1.1% |
|
| 60 | 15 | 1.0% |
|
| 39 | 14 | 1.0% |
|
| Other values (192) | 633 | 43.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 656 | 44.9% |
|
| 4 | 1 | 0.1% |
|
| 8 | 1 | 0.1% |
|
| 10 | 1 | 0.1% |
|
| 11 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 406 | 1 | 0.1% |
|
| 418 | 1 | 0.1% |
|
| 502 | 1 | 0.1% |
|
| 523 | 1 | 0.1% |
|
| 547 | 1 | 0.1% |
|
OverallCond
Numeric
| Distinct count | 9 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 5.5753 |
|---|---|
| Minimum | 1 |
| Maximum | 9 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 4 |
| Q1 | 5 |
| Median | 5 |
| Q3 | 6 |
| 95-th percentile | 8 |
| Maximum | 9 |
| Range | 8 |
| Interquartile range | 1 |
Descriptive statistics
| Standard deviation | 1.1128 |
|---|---|
| Coef of variation | 0.19959 |
| Kurtosis | 1.1064 |
| Mean | 5.5753 |
| MAD | 0.88902 |
| Skewness | 0.69307 |
| Sum | 8140 |
| Variance | 1.2383 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 5 | 821 | 56.2% |
|
| 6 | 252 | 17.3% |
|
| 7 | 205 | 14.0% |
|
| 8 | 72 | 4.9% |
|
| 4 | 57 | 3.9% |
|
| 3 | 25 | 1.7% |
|
| 9 | 22 | 1.5% |
|
| 2 | 5 | 0.3% |
|
| 1 | 1 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 1 | 0.1% |
|
| 2 | 5 | 0.3% |
|
| 3 | 25 | 1.7% |
|
| 4 | 57 | 3.9% |
|
| 5 | 821 | 56.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5 | 821 | 56.2% |
|
| 6 | 252 | 17.3% |
|
| 7 | 205 | 14.0% |
|
| 8 | 72 | 4.9% |
|
| 9 | 22 | 1.5% |
|
OverallQual
Numeric
| Distinct count | 10 |
|---|---|
| Unique (%) | 0.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 6.0993 |
|---|---|
| Minimum | 1 |
| Maximum | 10 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 4 |
| Q1 | 5 |
| Median | 6 |
| Q3 | 7 |
| 95-th percentile | 8 |
| Maximum | 10 |
| Range | 9 |
| Interquartile range | 2 |
Descriptive statistics
| Standard deviation | 1.383 |
|---|---|
| Coef of variation | 0.22675 |
| Kurtosis | 0.096293 |
| Mean | 6.0993 |
| MAD | 1.098 |
| Skewness | 0.21694 |
| Sum | 8905 |
| Variance | 1.9127 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 5 | 397 | 27.2% |
|
| 6 | 374 | 25.6% |
|
| 7 | 319 | 21.8% |
|
| 8 | 168 | 11.5% |
|
| 4 | 116 | 7.9% |
|
| 9 | 43 | 2.9% |
|
| 3 | 20 | 1.4% |
|
| 10 | 18 | 1.2% |
|
| 2 | 3 | 0.2% |
|
| 1 | 2 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 2 | 0.1% |
|
| 2 | 3 | 0.2% |
|
| 3 | 20 | 1.4% |
|
| 4 | 116 | 7.9% |
|
| 5 | 397 | 27.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6 | 374 | 25.6% |
|
| 7 | 319 | 21.8% |
|
| 8 | 168 | 11.5% |
|
| 9 | 43 | 2.9% |
|
| 10 | 18 | 1.2% |
|
PavedDrive
Categorical
| Distinct count | 3 |
|---|---|
| Unique (%) | 0.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Y |
1340
|
|---|---|
| N |
|
| P |
|
| Value | Count | Frequency (%) | |
| Y | 1340 | 91.8% |
|
| N | 90 | 6.2% |
|
| P | 30 | 2.1% |
|
PoolArea
Numeric
| Distinct count | 8 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.7589 |
|---|---|
| Minimum | 0 |
| Maximum | 738 |
| Zeros (%) | 99.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 0 |
| Maximum | 738 |
| Range | 738 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 40.177 |
|---|---|
| Coef of variation | 14.563 |
| Kurtosis | 223.27 |
| Mean | 2.7589 |
| MAD | 5.4914 |
| Skewness | 14.828 |
| Sum | 4028 |
| Variance | 1614.2 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1453 | 99.5% |
|
| 738 | 1 | 0.1% |
|
| 648 | 1 | 0.1% |
|
| 576 | 1 | 0.1% |
|
| 555 | 1 | 0.1% |
|
| 519 | 1 | 0.1% |
|
| 512 | 1 | 0.1% |
|
| 480 | 1 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1453 | 99.5% |
|
| 480 | 1 | 0.1% |
|
| 512 | 1 | 0.1% |
|
| 519 | 1 | 0.1% |
|
| 555 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 519 | 1 | 0.1% |
|
| 555 | 1 | 0.1% |
|
| 576 | 1 | 0.1% |
|
| 648 | 1 | 0.1% |
|
| 738 | 1 | 0.1% |
|
PoolQC
Categorical
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 99.5% |
| Missing (n) | 1453 |
| Gd |
|
|---|---|
| Fa |
|
| Ex |
|
| (Missing) |
1453
|
| Value | Count | Frequency (%) | |
| Gd | 3 | 0.2% |
|
| Fa | 2 | 0.1% |
|
| Ex | 2 | 0.1% |
|
| (Missing) | 1453 | 99.5% |
|
RoofMatl
Categorical
| Distinct count | 8 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| CompShg |
1434
|
|---|---|
| Tar&Grv |
|
| WdShngl |
|
| Other values (5) |
|
| Value | Count | Frequency (%) | |
| CompShg | 1434 | 98.2% |
|
| Tar&Grv | 11 | 0.8% |
|
| WdShngl | 6 | 0.4% |
|
| WdShake | 5 | 0.3% |
|
| Roll | 1 | 0.1% |
|
| ClyTile | 1 | 0.1% |
|
| Metal | 1 | 0.1% |
|
| Membran | 1 | 0.1% |
|
RoofStyle
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Gable |
1141
|
|---|---|
| Hip |
286
|
| Flat |
|
| Other values (3) |
|
| Value | Count | Frequency (%) | |
| Gable | 1141 | 78.2% |
|
| Hip | 286 | 19.6% |
|
| Flat | 13 | 0.9% |
|
| Gambrel | 11 | 0.8% |
|
| Mansard | 7 | 0.5% |
|
| Shed | 2 | 0.1% |
|
SaleCondition
Categorical
| Distinct count | 6 |
|---|---|
| Unique (%) | 0.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Normal |
1198
|
|---|---|
| Partial |
|
| Abnorml |
|
| Other values (3) |
|
| Value | Count | Frequency (%) | |
| Normal | 1198 | 82.1% |
|
| Partial | 125 | 8.6% |
|
| Abnorml | 101 | 6.9% |
|
| Family | 20 | 1.4% |
|
| Alloca | 12 | 0.8% |
|
| AdjLand | 4 | 0.3% |
|
SalePrice
Numeric
| Distinct count | 663 |
|---|---|
| Unique (%) | 45.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 180920 |
|---|---|
| Minimum | 34900 |
| Maximum | 755000 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 34900 |
|---|---|
| 5-th percentile | 88000 |
| Q1 | 129980 |
| Median | 163000 |
| Q3 | 214000 |
| 95-th percentile | 326100 |
| Maximum | 755000 |
| Range | 720100 |
| Interquartile range | 84025 |
Descriptive statistics
| Standard deviation | 79443 |
|---|---|
| Coef of variation | 0.4391 |
| Kurtosis | 6.5363 |
| Mean | 180920 |
| MAD | 57435 |
| Skewness | 1.8829 |
| Sum | 264144946 |
| Variance | 6311100000 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 140000 | 20 | 1.4% |
|
| 135000 | 17 | 1.2% |
|
| 145000 | 14 | 1.0% |
|
| 155000 | 14 | 1.0% |
|
| 190000 | 13 | 0.9% |
|
| 110000 | 13 | 0.9% |
|
| 160000 | 12 | 0.8% |
|
| 115000 | 12 | 0.8% |
|
| 139000 | 11 | 0.8% |
|
| 130000 | 11 | 0.8% |
|
| Other values (653) | 1323 | 90.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 34900 | 1 | 0.1% |
|
| 35311 | 1 | 0.1% |
|
| 37900 | 1 | 0.1% |
|
| 39300 | 1 | 0.1% |
|
| 40000 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 582933 | 1 | 0.1% |
|
| 611657 | 1 | 0.1% |
|
| 625000 | 1 | 0.1% |
|
| 745000 | 1 | 0.1% |
|
| 755000 | 1 | 0.1% |
|
SaleType
Categorical
| Distinct count | 9 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| WD |
1267
|
|---|---|
| New |
|
| COD |
|
| Other values (6) |
|
| Value | Count | Frequency (%) | |
| WD | 1267 | 86.8% |
|
| New | 122 | 8.4% |
|
| COD | 43 | 2.9% |
|
| ConLD | 9 | 0.6% |
|
| ConLI | 5 | 0.3% |
|
| ConLw | 5 | 0.3% |
|
| CWD | 4 | 0.3% |
|
| Oth | 3 | 0.2% |
|
| Con | 2 | 0.1% |
|
ScreenPorch
Numeric
| Distinct count | 76 |
|---|---|
| Unique (%) | 5.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.061 |
|---|---|
| Minimum | 0 |
| Maximum | 480 |
| Zeros (%) | 92.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 0 |
| 95-th percentile | 160 |
| Maximum | 480 |
| Range | 480 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 55.757 |
|---|---|
| Coef of variation | 3.7021 |
| Kurtosis | 18.439 |
| Mean | 15.061 |
| MAD | 27.729 |
| Skewness | 4.1222 |
| Sum | 21989 |
| Variance | 3108.9 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 1344 | 92.1% |
|
| 192 | 6 | 0.4% |
|
| 224 | 5 | 0.3% |
|
| 120 | 5 | 0.3% |
|
| 189 | 4 | 0.3% |
|
| 180 | 4 | 0.3% |
|
| 160 | 3 | 0.2% |
|
| 168 | 3 | 0.2% |
|
| 144 | 3 | 0.2% |
|
| 126 | 3 | 0.2% |
|
| Other values (66) | 80 | 5.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 1344 | 92.1% |
|
| 40 | 1 | 0.1% |
|
| 53 | 1 | 0.1% |
|
| 60 | 1 | 0.1% |
|
| 63 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 385 | 1 | 0.1% |
|
| 396 | 1 | 0.1% |
|
| 410 | 1 | 0.1% |
|
| 440 | 1 | 0.1% |
|
| 480 | 1 | 0.1% |
|
Street
Categorical
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Pave |
1454
|
|---|---|
| Grvl |
|
| Value | Count | Frequency (%) | |
| Pave | 1454 | 99.6% |
|
| Grvl | 6 | 0.4% |
|
TotRmsAbvGrd
Numeric
| Distinct count | 12 |
|---|---|
| Unique (%) | 0.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 6.5178 |
|---|---|
| Minimum | 2 |
| Maximum | 14 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2 |
|---|---|
| 5-th percentile | 4 |
| Q1 | 5 |
| Median | 6 |
| Q3 | 7 |
| 95-th percentile | 10 |
| Maximum | 14 |
| Range | 12 |
| Interquartile range | 2 |
Descriptive statistics
| Standard deviation | 1.6254 |
|---|---|
| Coef of variation | 0.24938 |
| Kurtosis | 0.88076 |
| Mean | 6.5178 |
| MAD | 1.2796 |
| Skewness | 0.67634 |
| Sum | 9516 |
| Variance | 2.6419 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 6 | 402 | 27.5% |
|
| 7 | 329 | 22.5% |
|
| 5 | 275 | 18.8% |
|
| 8 | 187 | 12.8% |
|
| 4 | 97 | 6.6% |
|
| 9 | 75 | 5.1% |
|
| 10 | 47 | 3.2% |
|
| 11 | 18 | 1.2% |
|
| 3 | 17 | 1.2% |
|
| 12 | 11 | 0.8% |
|
| Other values (2) | 2 | 0.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2 | 1 | 0.1% |
|
| 3 | 17 | 1.2% |
|
| 4 | 97 | 6.6% |
|
| 5 | 275 | 18.8% |
|
| 6 | 402 | 27.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 9 | 75 | 5.1% |
|
| 10 | 47 | 3.2% |
|
| 11 | 18 | 1.2% |
|
| 12 | 11 | 0.8% |
|
| 14 | 1 | 0.1% |
|
TotalBsmtSF
Numeric
| Distinct count | 721 |
|---|---|
| Unique (%) | 49.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1057.4 |
|---|---|
| Minimum | 0 |
| Maximum | 6110 |
| Zeros (%) | 2.5% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 519.3 |
| Q1 | 795.75 |
| Median | 991.5 |
| Q3 | 1298.2 |
| 95-th percentile | 1753 |
| Maximum | 6110 |
| Range | 6110 |
| Interquartile range | 502.5 |
Descriptive statistics
| Standard deviation | 438.71 |
|---|---|
| Coef of variation | 0.41488 |
| Kurtosis | 13.25 |
| Mean | 1057.4 |
| MAD | 321.28 |
| Skewness | 1.5243 |
| Sum | 1543847 |
| Variance | 192460 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 37 | 2.5% |
|
| 864 | 35 | 2.4% |
|
| 672 | 17 | 1.2% |
|
| 912 | 15 | 1.0% |
|
| 1040 | 14 | 1.0% |
|
| 816 | 13 | 0.9% |
|
| 728 | 12 | 0.8% |
|
| 768 | 12 | 0.8% |
|
| 848 | 11 | 0.8% |
|
| 780 | 11 | 0.8% |
|
| Other values (711) | 1283 | 87.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 37 | 2.5% |
|
| 105 | 1 | 0.1% |
|
| 190 | 1 | 0.1% |
|
| 264 | 3 | 0.2% |
|
| 270 | 1 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3094 | 1 | 0.1% |
|
| 3138 | 1 | 0.1% |
|
| 3200 | 1 | 0.1% |
|
| 3206 | 1 | 0.1% |
|
| 6110 | 1 | 0.1% |
|
Utilities
Categorical
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| AllPub |
1459
|
|---|---|
| NoSeWa |
|
| Value | Count | Frequency (%) | |
| AllPub | 1459 | 99.9% |
|
| NoSeWa | 1 | 0.1% |
|
WoodDeckSF
Numeric
| Distinct count | 274 |
|---|---|
| Unique (%) | 18.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 94.245 |
|---|---|
| Minimum | 0 |
| Maximum | 857 |
| Zeros (%) | 52.1% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 0 |
| Q3 | 168 |
| 95-th percentile | 335 |
| Maximum | 857 |
| Range | 857 |
| Interquartile range | 168 |
Descriptive statistics
| Standard deviation | 125.34 |
|---|---|
| Coef of variation | 1.3299 |
| Kurtosis | 2.993 |
| Mean | 94.245 |
| MAD | 102 |
| Skewness | 1.5414 |
| Sum | 137597 |
| Variance | 15710 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 0 | 761 | 52.1% |
|
| 192 | 38 | 2.6% |
|
| 100 | 36 | 2.5% |
|
| 144 | 33 | 2.3% |
|
| 120 | 31 | 2.1% |
|
| 168 | 28 | 1.9% |
|
| 140 | 15 | 1.0% |
|
| 224 | 14 | 1.0% |
|
| 240 | 10 | 0.7% |
|
| 208 | 10 | 0.7% |
|
| Other values (264) | 484 | 33.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 761 | 52.1% |
|
| 12 | 2 | 0.1% |
|
| 24 | 2 | 0.1% |
|
| 26 | 2 | 0.1% |
|
| 28 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 668 | 1 | 0.1% |
|
| 670 | 1 | 0.1% |
|
| 728 | 1 | 0.1% |
|
| 736 | 1 | 0.1% |
|
| 857 | 1 | 0.1% |
|
YearBuilt
Numeric
| Distinct count | 112 |
|---|---|
| Unique (%) | 7.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1971.3 |
|---|---|
| Minimum | 1872 |
| Maximum | 2010 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1872 |
|---|---|
| 5-th percentile | 1916 |
| Q1 | 1954 |
| Median | 1973 |
| Q3 | 2000 |
| 95-th percentile | 2007 |
| Maximum | 2010 |
| Range | 138 |
| Interquartile range | 46 |
Descriptive statistics
| Standard deviation | 30.203 |
|---|---|
| Coef of variation | 0.015322 |
| Kurtosis | -0.43955 |
| Mean | 1971.3 |
| MAD | 25.067 |
| Skewness | -0.61346 |
| Sum | 2878051 |
| Variance | 912.22 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 2006 | 67 | 4.6% |
|
| 2005 | 64 | 4.4% |
|
| 2004 | 54 | 3.7% |
|
| 2007 | 49 | 3.4% |
|
| 2003 | 45 | 3.1% |
|
| 1976 | 33 | 2.3% |
|
| 1977 | 32 | 2.2% |
|
| 1920 | 30 | 2.1% |
|
| 1959 | 26 | 1.8% |
|
| 1999 | 25 | 1.7% |
|
| Other values (102) | 1035 | 70.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1872 | 1 | 0.1% |
|
| 1875 | 1 | 0.1% |
|
| 1880 | 4 | 0.3% |
|
| 1882 | 1 | 0.1% |
|
| 1885 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2006 | 67 | 4.6% |
|
| 2007 | 49 | 3.4% |
|
| 2008 | 23 | 1.6% |
|
| 2009 | 18 | 1.2% |
|
| 2010 | 1 | 0.1% |
|
YearRemodAdd
Numeric
| Distinct count | 61 |
|---|---|
| Unique (%) | 4.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1984.9 |
|---|---|
| Minimum | 1950 |
| Maximum | 2010 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1950 |
|---|---|
| 5-th percentile | 1950 |
| Q1 | 1967 |
| Median | 1994 |
| Q3 | 2004 |
| 95-th percentile | 2007 |
| Maximum | 2010 |
| Range | 60 |
| Interquartile range | 37 |
Descriptive statistics
| Standard deviation | 20.645 |
|---|---|
| Coef of variation | 0.010401 |
| Kurtosis | -1.2722 |
| Mean | 1984.9 |
| MAD | 18.623 |
| Skewness | -0.50356 |
| Sum | 2897904 |
| Variance | 426.23 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 1950 | 178 | 12.2% |
|
| 2006 | 97 | 6.6% |
|
| 2007 | 76 | 5.2% |
|
| 2005 | 73 | 5.0% |
|
| 2004 | 62 | 4.2% |
|
| 2000 | 55 | 3.8% |
|
| 2003 | 51 | 3.5% |
|
| 2002 | 48 | 3.3% |
|
| 2008 | 40 | 2.7% |
|
| 1996 | 36 | 2.5% |
|
| Other values (51) | 744 | 51.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1950 | 178 | 12.2% |
|
| 1951 | 4 | 0.3% |
|
| 1952 | 5 | 0.3% |
|
| 1953 | 10 | 0.7% |
|
| 1954 | 14 | 1.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2006 | 97 | 6.6% |
|
| 2007 | 76 | 5.2% |
|
| 2008 | 40 | 2.7% |
|
| 2009 | 23 | 1.6% |
|
| 2010 | 6 | 0.4% |
|
YrSold
Numeric
| Distinct count | 5 |
|---|---|
| Unique (%) | 0.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2007.8 |
|---|---|
| Minimum | 2006 |
| Maximum | 2010 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2006 |
|---|---|
| 5-th percentile | 2006 |
| Q1 | 2007 |
| Median | 2008 |
| Q3 | 2009 |
| 95-th percentile | 2010 |
| Maximum | 2010 |
| Range | 4 |
| Interquartile range | 2 |
Descriptive statistics
| Standard deviation | 1.3281 |
|---|---|
| Coef of variation | 0.00066146 |
| Kurtosis | -1.1906 |
| Mean | 2007.8 |
| MAD | 1.1487 |
| Skewness | 0.096269 |
| Sum | 2931411 |
| Variance | 1.7638 |
| Memory size | 11.5 KiB |
| Value | Count | Frequency (%) | |
| 2009 | 338 | 23.2% |
|
| 2007 | 329 | 22.5% |
|
| 2006 | 314 | 21.5% |
|
| 2008 | 304 | 20.8% |
|
| 2010 | 175 | 12.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2006 | 314 | 21.5% |
|
| 2007 | 329 | 22.5% |
|
| 2008 | 304 | 20.8% |
|
| 2009 | 338 | 23.2% |
|
| 2010 | 175 | 12.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2006 | 314 | 21.5% |
|
| 2007 | 329 | 22.5% |
|
| 2008 | 304 | 20.8% |
|
| 2009 | 338 | 23.2% |
|
| 2010 | 175 | 12.0% |
|
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
A description of all data fields can be found on the Kaggle site: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data. Most of them are self-explanatory though.
Around half the variables are categorical and the other half are numerical. For the categorical variables, we will need to use one-hot encoding to incorporate them into the prediction models.
We can already spot some correlations that look promising. Some of them are also expected and won't give us further insights. We are particularly interested in correlations with our target variable.
Also, we can detect variables that probably won't be of much use e.g.
# preview the first rows of the object-dtype (categorical) columns only
df.select_dtypes(include=['object']).head()
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1 | RL | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 3 | RL | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | ... | Detchd | Unf | TA | TA | Y | NaN | NaN | NaN | WD | Abnorml |
| 4 | RL | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
5 rows × 43 columns
Let's check the missingness in more detail
# Build the missing-value summary once and reuse it, instead of calling the helper four times.
# NOTE(review): assumes na_ratio_table(df) returns the same table on every call — confirm in helper.
na_table = na_ratio_table(df)
# keep only the variables that actually contain missing values
na_present = na_table[na_table["NA_COUNT"] > 0]
display(na_present)
display(na_present.shape)
| NA_COUNT | NA_RATIO_PERC | |
|---|---|---|
| LotFrontage | 259 | 17.739726 |
| Alley | 1369 | 93.767123 |
| MasVnrType | 8 | 0.547945 |
| MasVnrArea | 8 | 0.547945 |
| BsmtQual | 37 | 2.534247 |
| BsmtCond | 37 | 2.534247 |
| BsmtExposure | 38 | 2.602740 |
| BsmtFinType1 | 37 | 2.534247 |
| BsmtFinType2 | 38 | 2.602740 |
| Electrical | 1 | 0.068493 |
| FireplaceQu | 690 | 47.260274 |
| GarageType | 81 | 5.547945 |
| GarageYrBlt | 81 | 5.547945 |
| GarageFinish | 81 | 5.547945 |
| GarageQual | 81 | 5.547945 |
| GarageCond | 81 | 5.547945 |
| PoolQC | 1453 | 99.520548 |
| Fence | 1179 | 80.753425 |
| MiscFeature | 1406 | 96.301370 |
(19, 2)
We have 19 variables that contain missing values. Most of them mean that the feature is simply not available for that property. However for a few, this can indicate a data quality issue:
Since only one record is missing for "Electrical", we can simply filter it out or even ignore it. For LotFrontage we can apply imputation techniques if necessary.
# Check whether values are missing at random or whether there are patterns in the missingness.
# This helps to tell data quality issues apart from missingness that is part of the data itself.
# msno.heatmap visualises the nullity correlation between columns.
msno.heatmap(df)
<matplotlib.axes._subplots.AxesSubplot at 0x1a34fde3438>
As expected, we can see that some of the variables are always missing together, which makes perfect sense.
Example: All Garage related variables are always missing together. Reason: no garage -> no values for any garage features.
The other group of variables missing together is related to the basement. Because of these correlations, we might run into some multicollinearity issues in the modelling part later on. "Multicollinearity is a state of very high intercorrelations or inter-associations among the independent variables. It is therefore a type of disturbance in the data, and if present in the data the statistical inferences made about the data may not be reliable." (https://www.statisticssolutions.com/multicollinearity/, Accessed on: 22.11.2018)
We're going to check if there are some inconsistencies in the data or duplicates, etc. (Quality assessment)
# any duplicates? keep=False flags every member of a duplicate group,
# so an empty result means there are no fully duplicated rows
df[df.duplicated(keep=False)]
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|
0 rows × 81 columns
# any house recorded as built after it was sold? (YearBuilt later than YrSold
# would point to a possible data inconsistency)
df.query('YearBuilt > YrSold')
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|
0 rows × 81 columns
# Remove columns that carry (almost) no information: Id only numbers the records,
# Street and Utilities are dominated by a single value (99.6% Pave / 99.9% AllPub).
df = df.drop(columns=["Id", "Street", "Utilities"])
# Drop the single record whose Electrical value is missing.
df = df[df["Electrical"].notna()]
# Summary statistics of the remaining numeric columns.
df.describe()
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1459.000000 | 1200.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1451.00000 | 1459.000000 | 1459.000000 | ... | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 | 1459.000000 |
| mean | 56.881426 | 70.047500 | 10517.363948 | 6.100069 | 5.575737 | 1971.244003 | 1984.850583 | 103.75672 | 443.943797 | 46.581220 | ... | 94.240576 | 46.692255 | 21.969157 | 3.411926 | 15.071282 | 2.760795 | 43.518849 | 6.322824 | 2007.815627 | 180930.394791 |
| std | 42.310746 | 24.294727 | 9984.666267 | 1.383171 | 1.113079 | 30.199555 | 20.644343 | 181.10815 | 456.106417 | 161.369977 | ... | 125.381679 | 66.267472 | 61.137400 | 29.327247 | 55.775138 | 40.191018 | 496.291826 | 2.704331 | 1.328542 | 79468.964025 |
| min | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.00000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 20.000000 | 59.000000 | 7549.000000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.00000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129950.000000 |
| 50% | 50.000000 | 69.000000 | 9477.000000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.00000 | 384.000000 | 0.000000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 70.000000 | 80.000000 | 11603.000000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.00000 | 712.500000 | 0.000000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.00000 | 5644.000000 | 1474.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 37 columns
Let's once again check the distribution of the SalePrice
# kernel density estimate of SalePrice (histogram disabled) with a fitted normal curve for comparison
sns.distplot(df["SalePrice"], hist=False, label="SalePrice", fit=norm, kde_kws={"shade": True})
<matplotlib.axes._subplots.AxesSubplot at 0x1a350680d68>
The distribution is right-skewed (skewness of about 1.88), with a long tail towards high prices.
# probability (Q-Q) plot of SalePrice against the normal distribution, probplot's default reference
stats.probplot(df['SalePrice'], plot=plt)
((array([-3.30494753, -3.04772643, -2.90468264, ..., 2.90468264,
3.04772643, 3.30494753]),
array([ 34900, 35311, 37900, ..., 625000, 745000, 755000], dtype=int64)),
(74187.860388132, 180930.39479095273, 0.9320032714937404))
SalePrice does not have a normal distribution.
# we check for further correlations using different plots
# restrict to the numeric columns and compute their pairwise correlation matrix
NUM_FEATURES =df.select_dtypes(include=[np.number]).columns.tolist()
df_num=df[NUM_FEATURES]
df_corr=df_num.corr()
corr_heatmap(df_corr, figsize=(20, 16))
# NOTE(review): corr_matrix_1 is given the correlation matrix here, but the raw data
# columns (df_high_corr) at its later call — confirm which input the helper expects
corr_matrix_1(df_corr)
<seaborn.axisgrid.PairGrid at 0x1a34fd64be0>
This visualization is not suitable due to the large number of variables. Let's filter out the ones with the highest correlations and visualize those.
# Work with absolute correlations so strong negative relationships count as well.
df_corr = df_corr.abs()
# Keep only the strict upper triangle: this hides the diagonal (always 1) and the mirrored
# duplicates below it. The builtin bool replaces the np.bool alias, which was deprecated in
# NumPy 1.20 and removed in 1.24.
upper_triangle = ~np.tril(np.ones(df_corr.shape)).astype(bool)
df_corr = df_corr.where(upper_triangle)
# Rows/columns that take part in at least one correlation above 0.75.
df_high_corr = df_corr[df_corr > 0.75].dropna(how="all", axis=0).dropna(how="all", axis=1)
df_high_corr_vars = np.unique(np.concatenate((df_high_corr.columns.values, df_high_corr.index.values)))
# From here on df_high_corr holds the raw data of those variables, not correlation values.
df_high_corr = df[df_high_corr_vars]
df_high_corr.shape
(1459, 10)
# plot only the variables that take part in at least one strong (> 0.75) correlation
corr_matrix_1(df_high_corr)
<seaborn.axisgrid.PairGrid at 0x1a303f0ec18>
# Ten strongest absolute correlations with the target. The column is selected by label
# instead of by position, so a changed column order cannot silently pick another variable.
# NOTE: df_corr holds only the upper triangle, so this is complete only while SalePrice
# is the last numeric column (as it is here).
df_corr["SalePrice"].sort_values(ascending=False).head(10)
OverallQual 0.791069 GrLivArea 0.708618 GarageCars 0.640473 GarageArea 0.623423 TotalBsmtSF 0.613905 1stFlrSF 0.605968 FullBath 0.560881 TotRmsAbvGrd 0.533779 YearBuilt 0.523273 YearRemodAdd 0.507430 Name: SalePrice, dtype: float64
The variables which seem to have a high correlation with the target variable are also amongst the ones with the highest correlations in general. Let's also have a look at some of the categorical variables.
# For every categorical feature draw one row of plots:
#   left  - box plot of SalePrice per category value
#   right - one SalePrice density curve per category value
CAT_FEATURES = df.select_dtypes(include="object").columns.tolist()
n_cat_rows = len(CAT_FEATURES)
plt.figure(figsize=(40, 400))
for i, a in enumerate(CAT_FEATURES):
    plt.subplot(n_cat_rows, 2, 2 * i + 1)
    sns.boxplot(x="SalePrice", y=a, data=df)
    plt.ylabel(a, fontsize=40)
    plt.tick_params(axis='y', which='major', labelsize=30)
    plt.subplot(n_cat_rows, 2, 2 * i + 2)
    # dropna(): a NaN category never matches `df[a] == b` and would yield an empty selection
    for b in df[a].dropna().unique():
        # label each curve with its category value (not the feature name) so the legend
        # distinguishes the curves
        sns.distplot(df.loc[df[a] == b, "SalePrice"], hist=False, label=b, kde_kws={"shade": True})
This gives us a feeling of which attributes might be more important and can help us predict the SalePrice. For these attributes for example the SalePrice is different based on the value:
For BsmtFinType2, for example, the SalePrice distributions of the different values can hardly be told apart. Hence, this attribute might have less predictive power than some of the other attributes.
The EDA already shows that there is potential to predict the SalePrice: there are definitely some relationships between the attributes and the SalePrice. The next step is to build models to predict the SalePrice and to investigate the significance of the differences in the attributes.
As mentioned in the beginning, we will also divide the SalePrice into three categories, namely "low", "middle" and "upper class". Thus, we also want to see how these three can be separated and if there are some obvious differences.
# SalePrice distribution again, this time with the histogram shown next to the KDE and the fitted normal curve
sns.distplot(df["SalePrice"], hist=True, label="SalePrice", fit=norm, kde_kws={"shade": True})
<matplotlib.axes._subplots.AxesSubplot at 0x1a333b33320>
# summary statistics of the target variable, used to choose the class boundaries
df["SalePrice"].describe()
count 1459.000000 mean 180930.394791 std 79468.964025 min 34900.000000 25% 129950.000000 50% 163000.000000 75% 214000.000000 max 755000.000000 Name: SalePrice, dtype: float64
# median SalePrice
df["SalePrice"].median()
163000.0
# histogram with 10 equal-width bins over the SalePrice range to see where most sales fall
plt.hist(df.SalePrice, bins=10)
(array([148., 722., 373., 135., 51., 19., 4., 3., 2., 2.]),
array([ 34900., 106910., 178920., 250930., 322940., 394950., 466960.,
538970., 610980., 682990., 755000.]),
<a list of 10 Patch objects>)
The range of the SalePrice is from 34,900 - 755,000. The middle class would be the most frequent one. We decided to use the following boundaries:
# create new column
def price_class(row, low_max=120000, middle_max=250000):
    """Return the price segment ("low", "middle" or "upper") for one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "SalePrice" entry.
        low_max: Highest SalePrice still counted as "low" (inclusive).
        middle_max: Highest SalePrice still counted as "middle" (inclusive).

    Returns:
        "low" if SalePrice <= low_max, "middle" if low_max < SalePrice <= middle_max,
        otherwise "upper". A NaN SalePrice fails both comparisons and ends up in "upper".
    """
    price = row["SalePrice"]
    if price <= low_max:
        return "low"
    # reaching this point already means price > low_max (or NaN), so one bound check suffices
    if price <= middle_max:
        return "middle"
    return "upper"
# label every record with its price segment; price_class is applied row by row (axis=1)
df["Price_Class"] = df.apply(price_class, axis=1)
# For every numeric feature draw one row of plots:
#   left  - box plot of the feature per price class
#   right - one density curve of the feature per price class
# The grid is sized by the number of numeric features being plotted
# (it was previously sized by the number of categorical features).
n_num_rows = len(NUM_FEATURES)
plt.figure(figsize=(40, 400))
for i, a in enumerate(NUM_FEATURES):
    plt.subplot(n_num_rows, 2, 2 * i + 1)
    sns.boxplot(y=a, x="Price_Class", data=df)
    plt.ylabel(a, fontsize=40)
    plt.tick_params(axis='y', which='major', labelsize=30)
    plt.subplot(n_num_rows, 2, 2 * i + 2)
    for b in df["Price_Class"].unique():
        sns.distplot(df.loc[df["Price_Class"] == b, a], hist=False, label=b, kde_kws={"shade": True})
For some of the numeric features we can already see some very distinguishable boundaries between the classes, e.g. GrLivArea and GarageArea. These look like promising attributes. Some others, like MoSold and BsmtUnfSF, are completely overlapping and will probably not help much in the predictions.